import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as ex
import plotly.graph_objs as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
pyo.init_notebook_mode()
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from wordcloud import WordCloud,STOPWORDS
from pandas.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from nltk.util import ngrams
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import random
plt.rc('figure',figsize=(17,13))
[nltk_data] Downloading package vader_lexicon to [nltk_data] C:\Users\sshir\AppData\Roaming\nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
# Load the vaccination-tweets dataset from a local CSV (absolute Windows path;
# per the info() output below it holds ~7.8k tweets with 16 columns).
data = pd.read_csv(r'C:\vaccination_tweets.csv')
# Quick visual sanity check of the first three rows.
data.head(3)
| id | user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | retweets | favorites | is_retweet | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.340540e+18 | Rachel Roh | La Crescenta-Montrose, CA | Aggregator of Asian American news; scanning di... | 4/8/2009 17:52 | 405 | 1692 | 3247 | False | 12/20/2020 6:06 | Same folks said daikon paste could treat a cyt... | ['PfizerBioNTech'] | Twitter for Android | 0 | 0 | False |
| 1 | 1.338160e+18 | Albert Fong | San Francisco, CA | Marketing dude, tech geek, heavy metal & '80s ... | 9/21/2009 15:27 | 834 | 666 | 178 | False | 12/13/2020 16:27 | While the world has been on the wrong side of ... | NaN | Twitter Web App | 1 | 1 | False |
| 2 | 1.337860e+18 | eli🇱🇹🇪🇺👌 | Your Bed | heil, hydra 🖐☺ | 6/25/2020 23:30 | 10 | 88 | 155 | False | 12/12/2020 20:33 | #coronavirus #SputnikV #AstraZeneca #PfizerBio... | ['coronavirus', 'SputnikV', 'AstraZeneca', 'Pf... | Twitter for Android | 0 | 0 | False |
# Summarize dtypes and non-null counts per column (shows missing values in
# user_location, user_description, hashtags and source).
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7783 entries, 0 to 7782 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 7783 non-null float64 1 user_name 7783 non-null object 2 user_location 6211 non-null object 3 user_description 7292 non-null object 4 user_created 7783 non-null object 5 user_followers 7783 non-null int64 6 user_friends 7783 non-null int64 7 user_favourites 7783 non-null int64 8 user_verified 7783 non-null bool 9 date 7783 non-null object 10 text 7783 non-null object 11 hashtags 5886 non-null object 12 source 7782 non-null object 13 retweets 7783 non-null int64 14 favorites 7783 non-null int64 15 is_retweet 7783 non-null bool dtypes: bool(2), float64(1), int64(5), object(8) memory usage: 866.6+ KB
# Count missing values per column (isna is the canonical alias of isnull).
data.isna().sum()
id 0 user_name 0 user_location 1572 user_description 491 user_created 0 user_followers 0 user_friends 0 user_favourites 0 user_verified 0 date 0 text 0 hashtags 1897 source 1 retweets 0 favorites 0 is_retweet 0 dtype: int64
# Add a new column with the length of each tweet's text.
# NOTE(review): despite the original "keywords" wording, str.len() counts
# CHARACTERS, not words/keywords — the downstream histogram is of tweet lengths.
data['Num_Keywords'] = data['text'].str.len()
# Show the 10 longest tweets (the original comment said 20, but head(10) is used).
data[['text','Num_Keywords']].sort_values('Num_Keywords',ascending = False).head(10)
| text | Num_Keywords | |
|---|---|---|
| 4789 | jus thinking’if looking & how big& wea... | 151 |
| 5026 | @yaffaesque @NewYorker Same questions that rea... | 148 |
| 627 | #bullshit-bingo propaganda for #Pfizer #Pfizer... | 148 |
| 7317 | @simchrison @Bryan13571 I know, I am German &a... | 148 |
| 2963 | @RosieBarton 🇨🇦 “has received & distribute... | 148 |
| 1246 | @PattyHajdu @GovCanHealth Vaccinate at your ow... | 148 |
| 4909 | Mrs @vonderleyen , please, let's have more Pfi... | 148 |
| 5898 | More #GoodNews from @bopanc & @DovLieber! ... | 148 |
| 3645 | Thanks Scientists. Thanks #NHS . Thanks Liver... | 148 |
| 3004 | 🇨🇦 has received & distributed over half a ... | 148 |
# Histogram of tweet text lengths (Num_Keywords is a character count).
fig, ax = plt.subplots()
num_bins = 100
ax.hist(data['Num_Keywords'], num_bins, color='green', alpha=0.7)
plt.show()
def clean(text):
    """Return *text* with URLs, bracketed spans, digit-bearing tokens and
    HTML-like tags removed, and runs of whitespace collapsed to one space.

    Note: whitespace is collapsed BEFORE the token removals, so a removed
    token may leave a double space behind (original behavior, preserved).
    """
    # Raw strings throughout: '\S', '\[', '\w' etc. in plain strings are
    # invalid escape sequences (SyntaxWarning on modern Python).
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # drop URLs
    text = re.sub(r'\s+', ' ', text, flags=re.I)       # collapse whitespace
    text = re.sub(r'\[.*?\]', '', text)                # drop [bracketed] spans
    text = re.sub(r'\n', '', text)                     # no-op after the collapse; kept for parity
    text = re.sub(r'\w*\d\w*', '', text)               # drop tokens containing digits
    text = re.sub(r'<.*?>+', '', text)                 # drop <tag>-like markup
    return text
# Clean every tweet in place (apply the function directly; a lambda wrapper adds nothing).
data['text'] = data['text'].apply(clean)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
# Compound VADER score per tweet. Iterating the Series values avoids the
# original's positional lookup data['text'][i], which breaks whenever the
# frame's index is not the default RangeIndex (e.g. after filtering/sorting).
scores = [analyser.polarity_scores(t)['compound'] for t in data['text']]
# Standard VADER cutoffs: compound >= 0.05 positive, <= -0.05 negative, else neutral.
sentiment = ['Positive' if s >= 0.05 else 'Negative' if s <= -0.05 else 'Neutral'
             for s in scores]
# Assign the plain list so values align by position regardless of the index;
# the original pd.Series(np.array(...)) only aligned correctly by accident.
data['sentiment'] = sentiment
data.head()
| id | user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | retweets | favorites | is_retweet | Num_Keywords | sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.340540e+18 | Rachel Roh | La Crescenta-Montrose, CA | Aggregator of Asian American news; scanning di... | 4/8/2009 17:52 | 405 | 1692 | 3247 | False | 12/20/2020 6:06 | Same folks said daikon paste could treat a cyt... | ['PfizerBioNTech'] | Twitter for Android | 0 | 0 | False | 97 | Positive |
| 1 | 1.338160e+18 | Albert Fong | San Francisco, CA | Marketing dude, tech geek, heavy metal & '80s ... | 9/21/2009 15:27 | 834 | 666 | 178 | False | 12/13/2020 16:27 | While the world has been on the wrong side of ... | NaN | Twitter Web App | 1 | 1 | False | 140 | Negative |
| 2 | 1.337860e+18 | eli🇱🇹🇪🇺👌 | Your Bed | heil, hydra 🖐☺ | 6/25/2020 23:30 | 10 | 88 | 155 | False | 12/12/2020 20:33 | #coronavirus #SputnikV #AstraZeneca #PfizerBio... | ['coronavirus', 'SputnikV', 'AstraZeneca', 'Pf... | Twitter for Android | 0 | 0 | False | 140 | Positive |
| 3 | 1.337860e+18 | Charles Adler | Vancouver, BC - Canada | Hosting "CharlesAdlerTonight" Global News Radi... | 9/10/2008 11:28 | 49165 | 3933 | 21853 | True | 12/12/2020 20:23 | Facts are immutable, Senator, even when you're... | NaN | Twitter Web App | 446 | 2129 | False | 140 | Neutral |
| 4 | 1.337850e+18 | Citizen News Channel | NaN | Citizen News Channel bringing you an alternati... | 4/23/2020 17:58 | 152 | 580 | 1473 | False | 12/12/2020 20:17 | Explain to me again why we need a vaccine @Bor... | ['whereareallthesickpeople', 'PfizerBioNTech'] | Twitter for iPhone | 0 | 0 | False | 135 | Neutral |
# Per-class VADER shares (pos/neu/neg) computed on a lowercased, word-only
# version of each tweet; the tiny epsilon keeps every share strictly positive.
sid = SIA()
data['sentiments'] = data['text'].apply(
    lambda x: sid.polarity_scores(' '.join(re.findall(r'\w+',x.lower()))))
for col, key in (('Positive Sentiment', 'pos'),
                 ('Neutral Sentiment', 'neu'),
                 ('Negative Sentiment', 'neg')):
    data[col] = data['sentiments'].apply(lambda d, k=key: d[k]+1*(10**-6))
data.drop(columns=['sentiments'],inplace=True)
data.head()
| id | user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | ... | hashtags | source | retweets | favorites | is_retweet | Num_Keywords | sentiment | Positive Sentiment | Neutral Sentiment | Negative Sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.340540e+18 | Rachel Roh | La Crescenta-Montrose, CA | Aggregator of Asian American news; scanning di... | 4/8/2009 17:52 | 405 | 1692 | 3247 | False | 12/20/2020 6:06 | ... | ['PfizerBioNTech'] | Twitter for Android | 0 | 0 | False | 97 | Positive | 0.231001 | 0.769001 | 0.000001 |
| 1 | 1.338160e+18 | Albert Fong | San Francisco, CA | Marketing dude, tech geek, heavy metal & '80s ... | 9/21/2009 15:27 | 834 | 666 | 178 | False | 12/13/2020 16:27 | ... | NaN | Twitter Web App | 1 | 1 | False | 140 | Negative | 0.109001 | 0.766001 | 0.125001 |
| 2 | 1.337860e+18 | eli🇱🇹🇪🇺👌 | Your Bed | heil, hydra 🖐☺ | 6/25/2020 23:30 | 10 | 88 | 155 | False | 12/12/2020 20:33 | ... | ['coronavirus', 'SputnikV', 'AstraZeneca', 'Pf... | Twitter for Android | 0 | 0 | False | 140 | Positive | 0.154001 | 0.846001 | 0.000001 |
| 3 | 1.337860e+18 | Charles Adler | Vancouver, BC - Canada | Hosting "CharlesAdlerTonight" Global News Radi... | 9/10/2008 11:28 | 49165 | 3933 | 21853 | True | 12/12/2020 20:23 | ... | NaN | Twitter Web App | 446 | 2129 | False | 140 | Neutral | 0.000001 | 1.000001 | 0.000001 |
| 4 | 1.337850e+18 | Citizen News Channel | NaN | Citizen News Channel bringing you an alternati... | 4/23/2020 17:58 | 152 | 580 | 1473 | False | 12/12/2020 20:17 | ... | ['whereareallthesickpeople', 'PfizerBioNTech'] | Twitter for iPhone | 0 | 0 | False | 135 | Neutral | 0.000001 | 1.000001 | 0.000001 |
5 rows × 21 columns
import matplotlib.pyplot as plt  # NOTE(review): duplicate import; pyplot is already imported at the top of the file
# Print how many tweets fall into each sentiment class.
print("Number of rows per star rating:")
print(data['sentiment'].value_counts())
# Function to map stars to sentiment
# Function to map a sentiment label to its numeric code.
def map_sentiment(label):
    """Return +1 for 'Positive', -1 for 'Negative', 0 for anything else."""
    codes = {'Positive': 1, 'Negative': -1}
    return codes.get(label, 0)
# Mapping sentiment labels into numeric codes (vectorized Series.map instead
# of a Python list comprehension; same values, same column).
data['sentiment1'] = data['sentiment'].map(map_sentiment)
# Plotting the sentiment distribution.
plt.figure()
# pd.value_counts(...) is deprecated (removed in pandas 3.0); call the
# Series method directly instead.
data['sentiment1'].value_counts().plot.bar(title="Sentiment distribution in df")
plt.xlabel("Sentiment")
plt.ylabel("No. of rows in data")
plt.show()
Number of rows per star rating: Positive 3355 Neutral 3075 Negative 1353 Name: sentiment, dtype: int64
# Parse the dates FIRST, then sort. The original sorted the raw
# 'M/D/YYYY H:MM' strings lexicographically, which orders e.g. '1/1/2021'
# before '12/12/2020' — chronologically wrong (the pasted output indeed
# starts at 2021-01-01 although the data begins in Dec 2020).
data['date'] = pd.to_datetime(data['date']).dt.date
data = data.sort_values(by='date')
data.head()
| id | user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | ... | source | retweets | favorites | is_retweet | Num_Keywords | sentiment | Positive Sentiment | Neutral Sentiment | Negative Sentiment | sentiment1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2126 | 1.344800e+18 | нσℓℓүαηη | From MI,VT,FL | #Wife👫#booknerd 📚#HospitalPharmacyVeteran🏩🤍#Mi... | 12/30/2015 22:35 | 768 | 2945 | 14014 | False | 2021-01-01 | ... | Twitter for iPad | 0 | 1 | False | 136 | Positive | 0.427001 | 0.573001 | 0.000001 | 1 |
| 2125 | 1.344800e+18 | Nick Ogami | Boise, ID | Medical Laboratory Scientist.Girl Dad. Steeler... | 10/28/2011 23:00 | 112 | 416 | 4067 | False | 2021-01-01 | ... | Twitter for iPhone | 0 | 2 | False | 140 | Positive | 0.140001 | 0.860001 | 0.000001 | 1 |
| 2124 | 1.344800e+18 | Amanda Adams MD | Walking the dogs | Family Physician 🇨🇦 Health Advocacy Special In... | 6/2/2012 1:34 | 592 | 774 | 6665 | False | 2021-01-01 | ... | Twitter for iPhone | 0 | 1 | False | 126 | Positive | 0.324001 | 0.676001 | 0.000001 | 1 |
| 2082 | 1.344950e+18 | Laura Heald | NaN | Nursing Associate👩⚕️ SUFC season ticket holde... | 6/9/2013 17:31 | 42 | 295 | 1360 | False | 2021-01-01 | ... | Twitter for iPhone | 0 | 2 | False | 133 | Positive | 0.253001 | 0.588001 | 0.158001 | 1 |
| 2081 | 1.344950e+18 | IANS Tweets | New Delhi | India's largest independent News Agency | 5/8/2010 13:21 | 60006 | 45 | 118 | True | 2021-01-01 | ... | Twitter Web App | 2 | 5 | False | 140 | Neutral | 0.000001 | 1.000001 | 0.000001 | 0 |
5 rows × 22 columns
# Tweets whose VADER positive (resp. negative) share is strongly dominant.
Most_Positive = data[data['Positive Sentiment'].between(0.4,1)]
Most_Negative = data[data['Negative Sentiment'].between(0.25,1)]
Most_Positive_text = ' '.join(Most_Positive.text)
Most_Negative_text = ' '.join(Most_Negative.text)
# Shared wordcloud settings; collocations off so bigrams are not merged.
wc_settings = dict(width=600, height=400, collocations=False, background_color='white')
pwc = WordCloud(**wc_settings).generate(Most_Positive_text)
nwc = WordCloud(**wc_settings).generate(Most_Negative_text)
# Render the two clouds side by side.
for panel, (cloud, caption) in enumerate(
        [(pwc, 'Common Words Among Most Positive Tweets'),
         (nwc, 'Common Words Among Most Negative Tweets')], start=1):
    plt.subplot(1, 2, panel)
    plt.title(caption, fontsize=16, fontweight='bold')
    plt.imshow(cloud)
    plt.axis('off')
plt.show()
# Daily tweet volume: one row per calendar date.
b_date_count = data.groupby(by='date').count().reset_index()
b_date_count = b_date_count.rename(columns={'id':'Tweets per day'})
fig = ex.line(b_date_count,x='date',y='Tweets per day')
# (Removed a stale commented-out annotation/shape block that referenced
# undefined variables b_date_mean and a 'day' column.)
# NOTE(review): after groupby().count() EVERY column holds the per-day
# non-null row count, so 'sentiment' below is a tweet count, not a sentiment
# score — this draws a horizontal line at the mean daily tweet count.
fig.add_shape(type="line",
              x0=b_date_count['date'].values[0], y0=b_date_count['sentiment'].mean(),
              x1=b_date_count['date'].values[-1], y1=b_date_count['sentiment'].mean(),
              line=dict(
                  color="Red",
                  width=2,
                  dash="dashdot",
              ),
              name='Mean',
              )
fig.update_traces(mode="markers+lines")
fig.update_layout(hovermode="x unified")
# Subset of positive tweets only (data_p is reused further below).
data_p=data[data['sentiment']=='Positive']
import datetime
# One row per date; every column now holds that day's positive-tweet count.
b_date_count_p = data_p.groupby(by='date').count().reset_index()
b_date_count_p = b_date_count_p.rename(columns={'id':'Positive tweets per day'})
fig = ex.line(b_date_count_p,x='date',y='Positive tweets per day')
fig.update_traces(mode="markers+lines")
fig.update_layout(hovermode="x unified")
# NOTE(review): after .count(), 'Positive Sentiment' is a per-day row count,
# so this mean line equals the mean daily positive-tweet count — presumably
# intended, but confirm; using the renamed count column would be clearer.
fig.add_shape(type="line",
    x0=b_date_count_p['date'].values[0], y0=b_date_count_p['Positive Sentiment'].mean(), x1=b_date_count_p['date'].values[-1], y1=b_date_count_p['Positive Sentiment'].mean(),
    line=dict(
        color="Red",
        width=2,
        dash="dashdot",
    ),
    name='Mean',
    )
# Open a related Reuters news article in the system's default browser
# (new=2 requests a new browser tab where supported).
import webbrowser
news_url = "https://www.reuters.com/business/healthcare-pharmaceuticals/pfizerbiontech-vaccine-appears-effective-against-mutation-new-coronavirus-2021-01-08/"
webbrowser.open(news_url, new=2)
True
# Daily volume of negative tweets.
data_n = data[data['sentiment'] == 'Negative']
import datetime
b_date_count_n = (data_n.groupby(by='date')
                        .count()
                        .reset_index()
                        .rename(columns={'id': 'Negative tweets per day'}))
fig = ex.line(b_date_count_n, x='date', y='Negative tweets per day')
fig.update_traces(mode="markers+lines")
fig.update_layout(hovermode="x unified")
# Horizontal mean line (after .count() this column is a per-day row count).
neg_mean = b_date_count_n['Negative Sentiment'].mean()
fig.add_shape(type="line",
              x0=b_date_count_n['date'].values[0], y0=neg_mean,
              x1=b_date_count_n['date'].values[-1], y1=neg_mean,
              line=dict(color="Red", width=2, dash="dashdot"),
              name='Mean',
              )
# Daily volume of neutral tweets.
data_t=data[data['sentiment']=='Neutral']
import datetime
# BUG FIX: the original grouped data_p (the POSITIVE subset), so the
# "Neutral tweets per day" figure actually plotted positive-tweet counts.
# Group the neutral subset that was just built.
b_date_count_t = data_t.groupby(by='date').count().reset_index()
b_date_count_t = b_date_count_t.rename(columns={'id':'Neutral tweets per day'})
fig = ex.line(b_date_count_t,x='date',y='Neutral tweets per day')
fig.update_traces(mode="markers+lines")
fig.update_layout(hovermode="x unified")
# Mean line; after .count() 'Neutral Sentiment' holds per-day row counts.
fig.add_shape(type="line",
    x0=b_date_count_t['date'].values[0], y0=b_date_count_t['Neutral Sentiment'].mean(), x1=b_date_count_t['date'].values[-1], y1=b_date_count_t['Neutral Sentiment'].mean(),
    line=dict(
        color="Red",
        width=2,
        dash="dashdot",
    ),
    name='Mean',
    )